import numpy as np
import random

def run_epsilon_greedy(values, probs, steps, epsilon):
    """Simulate an epsilon-greedy agent on a multi-armed bandit.

    On every step the agent explores (pulls a uniformly random arm) with
    probability ``epsilon`` and otherwise exploits the arm whose current
    average-reward estimate is highest.  The reward of the pulled arm is
    drawn from that arm's row of ``values`` using the matching row of
    ``probs`` as the distribution.

    Randomness comes from the global ``np.random`` state, so calling
    ``np.random.seed`` beforehand makes a run reproducible.

    Args:
        values: 2-D array-like; row ``k`` lists the possible rewards of arm ``k``.
        probs: 2-D array-like with the same shape as ``values``; row ``k``
            holds the probability of each reward of arm ``k``.
        steps: Number of pulls to simulate.
        epsilon: Exploration probability, between 0 and 1 inclusive.

    Returns:
        A tuple ``(total, estimates, counts)``: the accumulated reward, the
        per-arm average reward, and the per-arm pull count.  Both arrays are
        float arrays with one entry per arm.

    Raises:
        ValueError: If ``values`` and ``probs`` are not matching, non-empty
            2-D tables, or if ``epsilon`` is outside ``[0, 1]``.
    """
    values = np.asarray(values)
    probs = np.asarray(probs, dtype=float)
    if values.ndim != 2 or values.shape != probs.shape or values.shape[0] == 0:
        raise ValueError("values and probs must be non-empty 2-D tables of equal shape")
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError("epsilon must lie in [0, 1]")

    n_arms = values.shape[0]
    total = 0
    # Sized from the table instead of a literal so any number of arms works.
    # Counts stay floating point so the printed report keeps its format.
    estimates = np.zeros(n_arms)
    counts = np.zeros(n_arms)

    for _ in range(steps):
        if np.random.uniform() < epsilon:
            arm = np.random.choice(n_arms)  # explore: any arm, uniformly
        else:
            arm = np.argmax(estimates)  # exploit: ties go to the lowest index
        reward = np.random.choice(a=values[arm], p=probs[arm])
        total += reward
        counts[arm] += 1
        # Incremental mean: new_avg = old_avg + (reward - old_avg) / n.
        estimates[arm] += (reward - estimates[arm]) / counts[arm]

    return total, estimates, counts


# Possible coin payouts of each arm (one row per arm).
value = np.array([[0, 1, 2], [0, 5, 7], [0, 3, 4], [0, 6, 8], [0, 2, 9]])
# Probability of each payout, aligned element-wise with `value`.
pro = np.array([[0.1, 0.5, 0.4],
                [0.3, 0.3, 0.4],
                [0.2, 0.3, 0.5],
                [0.4, 0.5, 0.1],
                [0.3, 0.4, 0.3]])

K = len(value)  # number of arms, derived from the payout table
T = 100000  # total number of pulls
e = 0.01  # exploration probability (epsilon)

# P: total reward; Q: average reward per arm; N: number of pulls per arm.
P, Q, N = run_epsilon_greedy(value, pro, T, e)

print(P, Q, N)
